/* * ScreenSlicer (TM) * Copyright (C) 2013-2015 Machine Publishers, LLC * ops@machinepublishers.com | screenslicer.com | machinepublishers.com * Cincinnati, Ohio, USA * * You can redistribute this program and/or modify it under the terms of the GNU Affero General Public * License version 3 as published by the Free Software Foundation. * * "ScreenSlicer", "jBrowserDriver", "Machine Publishers", and "automatic, zero-config web scraping" * are trademarks of Machine Publishers, LLC. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Affero General Public License version 3 for more details. * * You should have received a copy of the GNU Affero General Public License version 3 along with this * program. If not, see http://www.gnu.org/licenses/ * * For general details about how to investigate and report license violations, please see * https://www.gnu.org/licenses/gpl-violation.html and email the author, ops@machinepublishers.com */ package com.screenslicer.core.scrape; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.FileUtils; import org.apache.tika.Tika; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.openqa.selenium.OutputType; import com.google.common.io.Files; import com.machinepublishers.browser.Browser; import com.machinepublishers.jbrowserdriver.JBrowserDriver; import com.machinepublishers.jbrowserdriver.ProxyConfig; import com.machinepublishers.jbrowserdriver.ProxyConfig.Type; import com.machinepublishers.jbrowserdriver.RequestHeaders; import com.machinepublishers.jbrowserdriver.Settings; import com.screenslicer.api.datatype.HtmlNode; import com.screenslicer.api.datatype.Proxy; import com.screenslicer.api.datatype.Result; import com.screenslicer.api.datatype.UrlTransform; import com.screenslicer.api.request.Fetch; import com.screenslicer.api.request.FormLoad; import com.screenslicer.api.request.FormQuery; import com.screenslicer.api.request.KeywordQuery; import com.screenslicer.api.request.Query; import com.screenslicer.api.request.Request; import com.screenslicer.common.CommonUtil; import com.screenslicer.common.Log; import com.screenslicer.common.Random; import com.screenslicer.core.scrape.Proceed.End; import com.screenslicer.core.scrape.neural.NeuralNetManager; import com.screenslicer.core.scrape.type.SearchResults; import com.screenslicer.core.service.ScreenSlicerBatch; import com.screenslicer.core.util.BrowserUtil; import com.screenslicer.core.util.NodeUtil; import com.screenslicer.core.util.UrlUtil; import com.screenslicer.webapp.WebApp; import de.l3s.boilerpipe.extractors.NumWordsRulesExtractor; public class Scrape { public static class ActionFailed extends Exception { private static final long serialVersionUID = 1L; public ActionFailed() { super(); } public ActionFailed(Throwable nested) { super(nested); Log.exception(nested); } public ActionFailed(String message) { super(message); } } public static class Cancelled extends Exception { } private static final AtomicReference<Browser[]> browsers = new AtomicReference<Browser[]>(new Browser[WebApp.THREADS]); private static final int MAX_INIT = 1000; private static final int HANG_TIME = 10 * 60 * 1000; private static final long WAIT = 2000; private static final Object cacheLock = new Object(); private static final AtomicReference<Map<String, List>> nextResults = new AtomicReference<Map<String, List>>(new HashMap<String, List>()); private static final AtomicReference<List<String>> cacheKeys = new AtomicReference<List<String>>(new ArrayList<String>()); private static final int LIMIT_CACHE = 5000; private static final int MAX_CACHE = 500; private static final int CLEAR_CACHE = 250; private static final boolean[] done = new boolean[WebApp.THREADS]; private static final Object doneLock = new Object(); private static final Object progressLock = new Object(); private static String progress1Key = ""; private static String progress2Key = ""; private static String progress1 = ""; private static String progress2 = ""; public static final List<Result> WAITING = new ArrayList<Result>(); public static void init() { for (int i = 0; i < WebApp.THREADS; i++) { NeuralNetManager.reset(new File("./resources/neural/config"), i); start(new Request(), false, i); synchronized (doneLock) { done[i] = true; } } } private static void start(Request req, boolean media, int threadNum) { Type proxyType = null; String proxyHost = null; int proxyPort = -1; String proxyUser = null; String proxyPassword = null; Proxy[] proxies = CommonUtil.isEmpty(req.proxies) ? new Proxy[] { req.proxy } : req.proxies; for (int curProxy = 0; curProxy < proxies.length; curProxy++) { Proxy proxy = proxies[curProxy]; if (proxy != null) { proxyType = proxy.type.equals(Proxy.TYPE_SOCKS) || proxy.type.equals(Proxy.TYPE_ALL) || proxy.type.equals(Proxy.TYPE_SOCKS_4) || proxy.type.equals(Proxy.TYPE_SOCKS_5) ? Type.SOCKS : (proxy.type.equals(Proxy.TYPE_HTTP) || proxy.type.equals(Proxy.TYPE_SSL) ? Type.HTTP : null); proxyHost = proxy.ip; proxyPort = proxy.port; if (!CommonUtil.isEmpty(proxy.username) || !CommonUtil.isEmpty(proxy.password)) { proxyUser = proxy.username == null ? "" : proxy.username; proxyPassword = proxy.password == null ? "" : proxy.password; } } } File downloadCache = new File("./download_cache" + threadNum); FileUtils.deleteQuietly(downloadCache); downloadCache.mkdir(); File mediaCache = null; if (media) { mediaCache = new File("./media_cache" + threadNum); FileUtils.deleteQuietly(mediaCache); mediaCache.mkdir(); } browsers.get()[threadNum] = new JBrowserDriver( new Settings.Builder(). requestHeaders(req.httpHeaders == null ? null : new RequestHeaders(new LinkedHashMap<String, String>(req.httpHeaders))). proxy(new ProxyConfig(proxyType, proxyHost, proxyPort, proxyUser, proxyPassword)). downloadDir(downloadCache). mediaDir(mediaCache).build()); browsers.get()[threadNum].manage().timeouts().pageLoadTimeout(req.timeout, TimeUnit.SECONDS); browsers.get()[threadNum].manage().timeouts().setScriptTimeout(req.timeout, TimeUnit.SECONDS); browsers.get()[threadNum].manage().timeouts().implicitlyWait(0, TimeUnit.SECONDS); } public static void forceQuit(int threadNum) { try { if (browsers.get()[threadNum] != null) { browsers.get()[threadNum].kill(); } } catch (Throwable t) { Log.exception(t); } } private static void restart(Request req, boolean media, int threadNum) { try { forceQuit(threadNum); } catch (Throwable t) { Log.exception(t); } try { browsers.get()[threadNum] = null; } catch (Throwable t) { Log.exception(t); } start(req, media, threadNum); } private static void push(String mapKey, List results) { synchronized (cacheLock) { nextResults.get().put(mapKey, results); if (nextResults.get().size() == LIMIT_CACHE) { List<String> toRemove = new ArrayList<String>(); for (Map.Entry<String, List> entry : nextResults.get().entrySet()) { if (!cacheKeys.get().contains(entry.getKey()) && !entry.getKey().equals(mapKey)) { toRemove.add(entry.getKey()); } } for (String key : toRemove) { nextResults.get().remove(key); } nextResults.get().put(mapKey, results); } if (results != null && !results.isEmpty()) { if (cacheKeys.get().size() == MAX_CACHE) { List<String> newCache = new ArrayList<String>(); for (int i = 0; i < CLEAR_CACHE; i++) { nextResults.get().remove(cacheKeys.get().get(i)); } for (int i = CLEAR_CACHE; i < MAX_CACHE; i++) { newCache.add(cacheKeys.get().get(i)); } cacheKeys.set(newCache); } cacheKeys.get().add(mapKey); } } } public static List<Result> cached(String mapKey) { synchronized (cacheLock) { if (nextResults.get().containsKey(mapKey)) { List<Result> ret = nextResults.get().get(mapKey); if (ret == null) { return WAITING; } return ret; } else { return null; } } } public static boolean busy() { synchronized (doneLock) { for (int i = 0; i < WebApp.THREADS; i++) { if (done[i]) { return false; } } } return true; } public static String progress(String mapKey) { synchronized (progressLock) { if (progress1Key.equals(mapKey)) { return progress1; } if (progress2Key.equals(mapKey)) { return progress2; } return ""; } } private static String toCacheUrl(String url, boolean fallback) { if (url == null) { return null; } if (fallback) { return "http://webcache.googleusercontent.com/search?q=cache:" + url.split("://")[1]; } String[] urlParts = url.split("://")[1].split("/", 2); String urlLhs = urlParts[0]; String urlRhs = urlParts.length > 1 ? urlParts[1] : ""; return "http://" + urlLhs + ".nyud.net:8080/" + urlRhs; } private static class DownloadedFiles { String content; String mimeType; String extension; String filename; public DownloadedFiles(int thread) { File file = new File("./download_cache" + thread); Collection<File> list = FileUtils.listFiles(file, null, false); if (!list.isEmpty()) { try { File download = list.iterator().next(); byte[] bytes = FileUtils.readFileToByteArray(download); content = Base64.encodeBase64String(bytes); filename = download.getName(); mimeType = new Tika().detect(bytes, filename); int index = filename.lastIndexOf("."); if (index > -1 && index < filename.length()) { extension = filename.substring(index + 1).toLowerCase(); filename = filename.substring(0, index); } } catch (Throwable t) { Log.exception(t); } finally { for (File cur : list) { FileUtils.deleteQuietly(cur); } } } } } private static class SavedMedia { Map<String, String> encodedBytes = new LinkedHashMap<String, String>(); Map<String, String> mimeTypes = new LinkedHashMap<String, String>(); public SavedMedia(String body, HtmlNode[] patterns, boolean allMedia, int thread) { if (allMedia || !CommonUtil.isEmpty(patterns)) { Document doc = CommonUtil.parse(body, null, false); List<Element> elementsTmp = new ArrayList<Element>(doc.getElementsByAttribute("src")); List<Element> elements = new ArrayList<Element>(); if (!CommonUtil.isEmpty(patterns)) { for (Element element : elementsTmp) { for (int i = 0; i < patterns.length; i++) { if (NodeUtil.matches(patterns[i], element)) { elements.add(element); break; } } } } if (allMedia || !elements.isEmpty()) { try { File dir = new File("./media_cache" + thread); Collection<File> list = FileUtils.listFiles(dir, new String[] { "content" }, false); File savedMeta = null; for (File savedContent : list) { try { byte[] rawContent = FileUtils.readFileToByteArray(savedContent); String content = Base64.encodeBase64String(rawContent); savedMeta = new File(dir, savedContent.getName().split("\\.")[0] + ".metadata"); List<String> lines = FileUtils.readLines(savedMeta); String url = lines.get(0); String reportedMimeType = lines.size() >= 2 ? lines.get(1) : ""; String detectedMimeType = new Tika().detect(rawContent); String mimeType = !CommonUtil.isEmpty(reportedMimeType) && !reportedMimeType.toLowerCase().contains("octet") ? reportedMimeType : (!CommonUtil.isEmpty(detectedMimeType) ? detectedMimeType : reportedMimeType); List<String> sources = sources(url, elements); if (sources.isEmpty() && allMedia) { encodedBytes.put(url, content); mimeTypes.put(url, mimeType); } else { for (String src : sources) { encodedBytes.put(src, content); mimeTypes.put(src, mimeType); } } } catch (Throwable t) { Log.exception(t); } finally { FileUtils.deleteQuietly(savedContent); FileUtils.deleteQuietly(savedMeta); } } } catch (Throwable t) { Log.exception(t); } } } } private static List<String> sources(String url, List<Element> elements) { List<String> sources = new ArrayList<String>(); for (Element element : elements) { String src = element.attr("src"); if (!CommonUtil.isEmpty(src) && url.endsWith(src)) { sources.add(src); } } return sources; } } private static void fetch(Browser browser, Context context) throws ActionFailed { boolean terminate = false; try { String origHandle = browser.getWindowHandle(); String origUrl = browser.getCurrentUrl(); String newHandle = null; if (context.query.fetchCached) { newHandle = BrowserUtil.newWindow(browser, context.depth == 0); } try { for (int i = context.query.currentResult(); i < context.newResults.size(); i++) { if (context.query.requireResultAnchor && !isUrlValid(context.newResults.get(i).url) && UrlUtil.uriScheme.matcher(context.newResults.get(i).url).matches()) { context.newResults.get(i).close(); context.query.markResult(i + 1); continue; } if (ScreenSlicerBatch.isCancelled(context.req.runGuid)) { return; } Log.info("Fetching URL " + context.newResults.get(i).url + ". Cached: " + context.query.fetchCached, false); try { context.newResults.get(i).pageHtml = getHelper(browser, context.query.throttle, CommonUtil.parseFragment(context.newResults.get(i).urlNode, false), context.newResults.get(i).url, context.query.fetchCached, context.req.runGuid, context.query.fetchInNewWindow, context.depth == 0 && context.query == null, context.query == null ? null : context.query.postFetchClicks); //TODO get downloads and media for the results page, not just fetched pages DownloadedFiles downloaded = new DownloadedFiles(context.threadNum); context.newResults.get(i).pageBinary = downloaded.content; context.newResults.get(i).pageBinaryMimeType = downloaded.mimeType; context.newResults.get(i).pageBinaryExtension = downloaded.extension; context.newResults.get(i).pageBinaryFilename = downloaded.filename; SavedMedia media = new SavedMedia(context.newResults.get(i).pageHtml, context.query.media, context.query.allMedia, context.threadNum); context.newResults.get(i).mediaBinaries.putAll(media.encodedBytes); context.newResults.get(i).mediaMimeTypes.putAll(media.mimeTypes); if (!CommonUtil.isEmpty(context.newResults.get(i).pageHtml)) { try { context.newResults.get(i).pageText = NumWordsRulesExtractor.INSTANCE.getText(context.newResults.get(i).pageHtml); } catch (Throwable t) { context.newResults.get(i).pageText = null; Log.exception(t); } } if (context.recQuery != null) { context.recResults.addPage(scrape(context.recQuery, context.req, context.depth + 1, false, context.media, context.cache, context.threadNum)); } if (context.query.collapse) { context.newResults.get(i).close(); } context.query.markResult(i + 1); } catch (Browser.Retry r) { terminate = true; throw r; } catch (Browser.Fatal f) { terminate = true; throw f; } catch (Throwable t) { terminate = true; throw new ActionFailed(t); } try { if (!browser.getWindowHandle().equals(origHandle)) { browser.close(); browser.switchTo().window(origHandle); browser.switchTo().defaultContent(); } else if (!context.query.fetchInNewWindow) { BrowserUtil.get(browser, origUrl, true, context.depth == 0); SearchResults.revalidate(browser, false, context.threadNum); } } catch (Browser.Retry r) { terminate = true; throw r; } catch (Browser.Fatal f) { terminate = true; throw f; } catch (Throwable t) { terminate = true; throw new ActionFailed(t); } } } catch (Browser.Retry r) { terminate = true; throw r; } catch (Browser.Fatal f) { terminate = true; throw f; } catch (Throwable t) { terminate = true; throw new ActionFailed(t); } finally { if (!terminate) { if (!context.query.fetchInNewWindow || (context.query.fetchCached && origHandle.equals(newHandle))) { if (context.query.fetchInNewWindow) { Log.exception(new Throwable("Failed opening new window")); } BrowserUtil.get(browser, origUrl, true, context.depth == 0); } else { BrowserUtil.handleNewWindows(browser, origHandle, context.depth == 0); } } } } catch (Browser.Retry r) { terminate = true; throw r; } catch (Browser.Fatal f) { terminate = true; throw f; } catch (Throwable t) { terminate = true; throw new ActionFailed(t); } finally { if (!terminate) { BrowserUtil.browserSleepLong(context.query.throttle); } } } private static String getHelper(final Browser browser, final boolean throttle, final Node urlNode, final String url, final boolean p_cached, final String runGuid, final boolean toNewWindow, final boolean init, final HtmlNode[] postFetchClicks) { if (!CommonUtil.isEmpty(url) || urlNode != null) { final Object resultLock = new Object(); final String initVal; final String[] result; synchronized (resultLock) { initVal = Random.next(); result = new String[] { initVal }; } final AtomicBoolean started = new AtomicBoolean(); Thread thread = new Thread(new Runnable() { @Override public void run() { boolean terminate = false; started.set(true); boolean cached = p_cached; String newHandle = null; String origHandle = null; try { origHandle = browser.getWindowHandle(); String content = null; if (!cached) { try { BrowserUtil.get(browser, url, urlNode, false, toNewWindow, init); } catch (Browser.Retry r) { terminate = true; throw r; } catch (Browser.Fatal f) { terminate = true; throw f; } catch (Throwable t) { if (urlNode != null) { BrowserUtil.newWindow(browser, init); } BrowserUtil.get(browser, url, false, init); } if (urlNode != null) { newHandle = browser.getWindowHandle(); } BrowserUtil.doClicks(browser, postFetchClicks, null, null); content = browser.getPageSource(); if (WebApp.DEBUG && (postFetchClicks == null || postFetchClicks.length == 0)) { try { long filename = System.currentTimeMillis(); Files.copy(browser.getScreenshotAs(OutputType.FILE), new File("./" + filename + ".log.scrape.png")); FileUtils.writeStringToFile( new File("./" + filename + ".log.scrape.htm"), content, "utf-8"); } catch (IOException e) {} } if (CommonUtil.isEmpty(content)) { cached = true; } } if (cached) { if (ScreenSlicerBatch.isCancelled(runGuid)) { return; } try { BrowserUtil.get(browser, toCacheUrl(url, false), false, init); } catch (Browser.Retry r) { terminate = true; throw r; } catch (Browser.Fatal f) { terminate = true; throw f; } catch (Throwable t) { BrowserUtil.get(browser, toCacheUrl(url, true), false, init); } content = browser.getPageSource(); } content = NodeUtil.clean(content, browser.getCurrentUrl()).outerHtml(); //TODO make iframes work // if (!CommonUtil.isEmpty(content)) { // Document doc = Jsoup.parse(content); // Elements docFrames = doc.getElementsByTag("iframe"); // List<WebElement> iframes = browser.findElementsByTagName("iframe"); // int cur = 0; // for (WebElement iframe : iframes) { // try { // browser.switchTo().frame(iframe); // String frameSrc = browser.getPageSource(); // if (!CommonUtil.isEmpty(frameSrc) && cur < docFrames.size()) { // docFrames.get(cur).html( // Util.outerHtml(Jsoup.parse(frameSrc).body().childNodes())); // } // } catch (Throwable t) { // Log.exception(t); // } // ++cur; // } // browser.switchTo().defaultContent(); // content = doc.outerHtml(); // } synchronized (resultLock) { result[0] = content; } } catch (Browser.Retry r) { terminate = true; throw r; } catch (Browser.Fatal f) { terminate = true; throw f; } catch (Throwable t) { Log.exception(t); } finally { synchronized (resultLock) { if (initVal.equals(result[0])) { result[0] = null; } } if (!terminate) { BrowserUtil.browserSleepLong(throttle); if (init && newHandle != null && origHandle != null) { try { BrowserUtil.handleNewWindows(browser, origHandle, true); } catch (Browser.Retry r) { throw r; } catch (Browser.Fatal f) { throw f; } catch (Throwable t) { Log.exception(t); } } } } } }); thread.start(); try { while (!started.get()) { try { Thread.sleep(WAIT); } catch (Throwable t) {} } thread.join(HANG_TIME); synchronized (resultLock) { if (initVal.equals(result[0])) { Log.exception(new Exception("Browser is hanging")); try { thread.interrupt(); } catch (Throwable t) { Log.exception(t); } throw new Browser.Retry(); } return result[0]; } } catch (Browser.Retry r) { throw r; } catch (Browser.Fatal f) { throw f; } catch (Throwable t) { Log.exception(t); } } return null; } private static int getThread() { synchronized (doneLock) { while (true) { for (int i = 0; i < WebApp.THREADS; i++) { if (done[i]) { done[i] = false; return i; } } try { doneLock.wait(); } catch (InterruptedException e) {} } } } public static String get(Fetch fetch, Request req) { if (!isUrlValid(fetch.url)) { return null; } final int myThread = getThread(); if (!req.continueSession) { restart(req, !CommonUtil.isEmpty(fetch.media), myThread); } Log.info("Get URL " + fetch.url + ". Cached: " + fetch.fetchCached, false); String resp = ""; try { resp = getHelper(browsers.get()[myThread], fetch.throttle, null, fetch.url, fetch.fetchCached, req.runGuid, true, true, fetch.postFetchClicks); } catch (Browser.Retry r) { throw r; } catch (Browser.Fatal f) { throw f; } catch (Throwable t) { Log.exception(t); } finally { synchronized (doneLock) { done[myThread] = true; doneLock.notify(); } } return resp; } private static SearchResults filterResults(SearchResults results, String[] whitelist, String[] patterns, HtmlNode[] urlNodes, UrlTransform[] urlTransforms, boolean forExport) { if (results == null) { return SearchResults.newInstance(true); } SearchResults ret; results = UrlUtil.transformUrls(results, urlTransforms, forExport); if ((whitelist == null || whitelist.length == 0) && (patterns == null || patterns.length == 0) && (urlNodes == null || urlNodes.length == 0)) { ret = results; } else { List<Result> filtered = new ArrayList<Result>(); for (int i = 0; i < results.size(); i++) { if (!NodeUtil.isResultFiltered(results.get(i), whitelist, patterns, urlNodes)) { filtered.add(results.get(i)); } } if (filtered.isEmpty() && !results.isEmpty()) { Log.warn("Filtered every url, e.g., " + results.get(0).url); } ret = SearchResults.newInstance(true, filtered, results); } return ret; } public static List<HtmlNode> loadForm(FormLoad formLoad, Request req) throws ActionFailed { if (!isUrlValid(formLoad.site)) { return new ArrayList<HtmlNode>(); } final int myThread = getThread(); if (!req.continueSession) { restart(req, false, myThread); } try { List<HtmlNode> ret = null; try { ret = QueryForm.load(browsers.get()[myThread], formLoad, true); } catch (Browser.Retry r) { throw r; } catch (Browser.Fatal f) { throw f; } catch (Throwable t) { if (!req.continueSession) { restart(req, false, myThread); } ret = QueryForm.load(browsers.get()[myThread], formLoad, true); } return ret; } finally { synchronized (doneLock) { done[myThread] = true; doneLock.notify(); } } } private static class Context { private Request req; private Query query; private Query recQuery; private int page; private int depth; private SearchResults allResults; private SearchResults newResults; private SearchResults recResults; private List<String> resultPages; private boolean media; private Map<String, Object> cache; private int threadNum; } private static void handlePage(Context context) throws ActionFailed, End { if (context.query.extract) { if (context.newResults.isEmpty()) { SearchResults tmpResults; try { tmpResults = ProcessPage.perform(browsers.get()[context.threadNum], context.page, context.query, context.threadNum); } catch (Browser.Retry r) { SearchResults.revalidate(browsers.get()[context.threadNum], true, context.threadNum); tmpResults = ProcessPage.perform(browsers.get()[context.threadNum], context.page, context.query, context.threadNum); } tmpResults = filterResults(tmpResults, context.query.urlWhitelist, context.query.urlPatterns, context.query.urlMatchNodes, context.query.urlTransforms, false); if (context.allResults.isDuplicatePage(tmpResults)) { throw new End(); } if (context.query.results > 0 && context.allResults.size() + tmpResults.size() > context.query.results) { int remove = context.allResults.size() + tmpResults.size() - context.query.results; for (int i = 0; i < remove && !tmpResults.isEmpty(); i++) { tmpResults.remove(tmpResults.size() - 1); } } context.newResults.addPage(tmpResults); } if (context.query.fetch) { fetch(browsers.get()[context.threadNum], context); } if (context.query.collapse) { for (int i = 0; i < context.newResults.size(); i++) { context.newResults.get(i).close(); } } context.allResults.addPage(context.newResults); } else { context.resultPages.add(NodeUtil.clean(browsers.get()[context.threadNum].getPageSource(), browsers.get()[context.threadNum].getCurrentUrl()).outerHtml()); } } public static List<Result> scrape(Query query, Request req) { final int myThread = getThread(); boolean media = hasMedia(query); if (!req.continueSession) { restart(req, media, myThread); } try { Map<String, Object> cache = new HashMap<String, Object>(); SearchResults ret = null; for (int i = 0; i < MAX_INIT; i++) { try { ret = scrape(query, req, 0, i + 1 == MAX_INIT, media, cache, myThread); Log.info("Scrape finished"); List<Result> searchResults = ret.drain(); for (Result result : searchResults) { if (result.isClosed()) { Result.addHold(); } } return searchResults; } catch (Browser.Fatal f) { Log.exception(f); Log.warn("Reinitializing state and resuming scrape..."); restart(req, media, myThread); } } return null; } finally { synchronized (doneLock) { done[myThread] = true; doneLock.notify(); } } } private static boolean hasMedia(Query query) { Query cur = query; while (cur != null) { if (!CommonUtil.isEmpty(cur.media) || cur.allMedia) { return true; } cur = query.keywordQuery == null ? query.formQuery : query.keywordQuery; } return false; } private static SearchResults scrape(Query query, Request req, int depth, boolean fallback, boolean media, Map<String, Object> cache, int threadNum) { Context context = new Context(); context.req = req; context.query = query; context.recQuery = query.keywordQuery == null ? (query.formQuery == null ? null : query.formQuery) : query.keywordQuery; context.depth = depth; context.media = media; context.cache = cache; context.threadNum = threadNum; if (cache.containsKey(Integer.toString(depth))) { Map<String, Object> curCache = (Map<String, Object>) cache.get(Integer.toString(depth)); context.allResults = (SearchResults) curCache.get("results"); context.recResults = (SearchResults) curCache.get("recResults"); context.resultPages = (List<String>) curCache.get("resultPages"); } else { Map<String, Object> curCache = new HashMap<String, Object>(); cache.put(Integer.toString(depth), curCache); context.allResults = SearchResults.newInstance(false); curCache.put("results", context.allResults); context.recResults = SearchResults.newInstance(false); curCache.put("recResults", context.recResults); context.resultPages = new ArrayList<String>(); curCache.put("resultPages", context.resultPages); } try { if (ScreenSlicerBatch.isCancelled(req.runGuid)) { throw new Cancelled(); } if (query.isFormQuery()) { Log.info("FormQuery for URL " + query.site, false); try { QueryForm.perform(browsers.get()[threadNum], (FormQuery) query, depth == 0); } catch (Throwable e) { if (depth == 0) { restart(req, media, threadNum); } QueryForm.perform(browsers.get()[threadNum], (FormQuery) query, depth == 0); } } else { Log.info("KewordQuery for URL " + query.site + ". Query: " + ((KeywordQuery) query).keywords, false); try { QueryKeyword.perform(browsers.get()[threadNum], (KeywordQuery) query, depth == 0); } catch (Throwable e) { if (depth == 0) { restart(req, media, threadNum); } QueryKeyword.perform(browsers.get()[threadNum], (KeywordQuery) query, depth == 0); } } if (ScreenSlicerBatch.isCancelled(req.runGuid)) { throw new Cancelled(); } String priorProceedLabel = null; for (int page = 1; (page <= query.pages || query.pages <= 0) && (context.allResults.size() < query.results || query.results <= 0); page++) { if (ScreenSlicerBatch.isCancelled(req.runGuid)) { throw new Cancelled(); } if (page > 1) { if (!query.fetch) { try { BrowserUtil.browserSleepLong(query.throttle); } catch (Throwable t) { Log.exception(t); } } Log.info("Proceeding to page " + page); try { priorProceedLabel = Proceed.perform(browsers.get()[threadNum], query.proceedClicks, page, priorProceedLabel); } catch (Browser.Retry r) { SearchResults.revalidate(browsers.get()[threadNum], true, threadNum); priorProceedLabel = Proceed.perform(browsers.get()[threadNum], query.proceedClicks, page, priorProceedLabel); } if (ScreenSlicerBatch.isCancelled(req.runGuid)) { throw new Cancelled(); } } if (query.currentPage() + 1 == page) { context.page = page; context.newResults = SearchResults.newInstance(true); try { handlePage(context); } catch (Browser.Retry r) { SearchResults.revalidate(browsers.get()[threadNum], true, threadNum); handlePage(context); } query.markPage(page); query.markResult(0); } } query.markPage(0); } catch (End e) { Log.info("Reached end of results", false); } catch (Cancelled c) { Log.info("Cancellation requested."); } catch (Throwable t) { if (fallback) { Log.warn("Too many errors. Finishing scrape..."); } else { throw new Browser.Fatal(t); } } cache.remove(Integer.toString(depth)); if (query.extract) { if (context.recResults.isEmpty()) { return filterResults(context.allResults, query.urlWhitelist, query.urlPatterns, query.urlMatchNodes, query.urlTransforms, true); } if (query.collapse) { for (int i = 0; i < context.allResults.size(); i++) { context.allResults.get(i).remove(); } } return context.recResults; } List<Result> pages = new ArrayList<Result>(); for (String page : context.resultPages) { Result r = new Result(); r.html = page; pages.add(r); } return SearchResults.newInstance(false, pages, null); } private static boolean isUrlValid(String url) { return !CommonUtil.isEmpty(url) && (url.startsWith("https://") || url.startsWith("http://")); } public static List<Result> scrape(String url, final String query, final int pages, final String mapKey1, final String mapKey2) { if (!isUrlValid(url)) { return new ArrayList<Result>(); } synchronized (doneLock) { if (!done[0]) { return null; } done[0] = false; } restart(new Request(), false, 0); List<Result> results = new ArrayList<Result>(); final KeywordQuery keywordQuery = new KeywordQuery(); try { synchronized (progressLock) { progress1Key = mapKey1; progress2Key = mapKey2; progress1 = "Page 1 progress: performing search query..."; progress2 = "Page 2 progress: waiting for prior page extraction to finish..."; } push(mapKey1, null); keywordQuery.site = url; keywordQuery.keywords = query; QueryKeyword.perform(browsers.get()[0], keywordQuery, true); synchronized (progressLock) { progress1 = "Page 1 progress: extracting results..."; } results.addAll(ProcessPage.perform(browsers.get()[0], 1, keywordQuery, 0).drain()); synchronized (progressLock) { progress1 = ""; } } catch (Throwable t) { Log.exception(t); push(mapKey1, results); synchronized (progressLock) { progress1 = ""; progress2 = "Page 2 progress: prior page extraction was not completed."; } synchronized (doneLock) { done[0] = true; } return results; } try { push(mapKey2, null); push(mapKey1, results); } catch (Throwable t) { Log.exception(t); synchronized (progressLock) { progress1 = ""; progress2 = "Page 2 progress: prior page extraction was not completed."; } synchronized (doneLock) { done[0] = true; } return results; } new Thread(new Runnable() { @Override public void run() { List<Result> next = new ArrayList<Result>(); try { synchronized (progressLock) { progress2 = "Page 2 progress: getting page..."; } Proceed.perform(browsers.get()[0], null, 2, query); synchronized (progressLock) { progress2 = "Page 2 progress: extracting results..."; } next.addAll(ProcessPage.perform(browsers.get()[0], 2, keywordQuery, 0).drain()); } catch (End e) { Log.info("Reached end of results", false); } catch (Throwable t) { Log.exception(t); } finally { push(mapKey2, next); synchronized (progressLock) { progress2 = ""; } synchronized (doneLock) { done[0] = true; } } } }).start(); return results; } }